solr/spelling/WordBreakSolrSpellChecker.java

package org.apache.solr.spelling;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.spell.CombineSuggestion;
import org.apache.lucene.search.spell.SuggestWord;
import org.apache.lucene.search.spell.WordBreakSpellChecker;
import org.apache.lucene.search.spell.WordBreakSpellChecker.BreakSuggestionSortMethod;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.SolrCore;
import org.apache.solr.search.SolrIndexSearcher;

/**
 * <p>
 * A spellchecker that breaks and combines words.
 * </p>
 * <p>
 * This will not combine adjacent tokens that do not have
 * the same required status (prohibited, required, optional).
 * However, this feature depends on incoming term flags
 * being properly set. ({@link QueryConverter#PROHIBITED_TERM_FLAG},
 * {@link QueryConverter#REQUIRED_TERM_FLAG},
 * {@link QueryConverter#TERM_IN_BOOLEAN_QUERY_FLAG}, and
 * {@link QueryConverter#TERM_PRECEDES_NEW_BOOLEAN_OPERATOR_FLAG} )
 * This feature breaks completely if the upstream analyzer or query
 * converter sets flags with the same values but different meanings.
 * The default query converter (if not using "spellcheck.q")
 * is {@link SpellingQueryConverter}, which properly sets these flags.
 * </p>
 */
public class WordBreakSolrSpellChecker extends SolrSpellChecker {
  /**
   * <p>
   * Try to combine multiple words into one? [true|false]
   * </p>
   */
  public static final String PARAM_COMBINE_WORDS = "combineWords";
  /**
   * <p>
   * Try to break words into multiples? [true|false]
   * </p>
   */
  public static final String PARAM_BREAK_WORDS = "breakWords";
  /**
   * See {@link WordBreakSpellChecker#setMaxChanges}
   */
  public static final String PARAM_MAX_CHANGES = "maxChanges";
  /**
   * See {@link WordBreakSpellChecker#setMaxCombineWordLength}
   */
  public static final String PARAM_MAX_COMBINE_WORD_LENGTH = "maxCombinedLength";
  /**
   * See {@link WordBreakSpellChecker#setMinBreakWordLength}
   */
  public static final String PARAM_MIN_BREAK_WORD_LENGTH = "minBreakLength";
  /**
   * See {@link BreakSuggestionTieBreaker} for options.
   */
  public static final String PARAM_BREAK_SUGGESTION_TIE_BREAKER = "breakSugestionTieBreaker";
  /**
   * See {@link WordBreakSpellChecker#setMaxEvaluations}
   */
  public static final String PARAM_MAX_EVALUATIONS = "maxEvaluations";
  /**
   * See {@link WordBreakSpellChecker#setMinSuggestionFrequency}
   */
  public static final String PARAM_MIN_SUGGESTION_FREQUENCY = "minSuggestionFreq";

  /**
   * <p>
   *  Specify a value on the "breakSugestionTieBreaker" parameter.
   *    The default is MAX_FREQ.
   * </p>
   */
  public enum BreakSuggestionTieBreaker {
    /**
     * See
     * {@link BreakSuggestionSortMethod#NUM_CHANGES_THEN_MAX_FREQUENCY}
     * #
     */
    MAX_FREQ,
    /**
     * See
     * {@link BreakSuggestionSortMethod#NUM_CHANGES_THEN_SUMMED_FREQUENCY}
     */
    SUM_FREQ
  };

  private WordBreakSpellChecker wbsp = null;
  private boolean combineWords = false;
  private boolean breakWords = false;
  private BreakSuggestionSortMethod sortMethod = BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY;
  private static final Pattern spacePattern = Pattern.compile("\\s+");

  @Override
  public String init(@SuppressWarnings("unchecked") NamedList config,
      SolrCore core) {
    String name = super.init(config, core);
    combineWords = boolParam(config, PARAM_COMBINE_WORDS);
    breakWords = boolParam(config, PARAM_BREAK_WORDS);
    wbsp = new WordBreakSpellChecker();
    String bstb = strParam(config, PARAM_BREAK_SUGGESTION_TIE_BREAKER);
    if (bstb != null) {
      bstb = bstb.toUpperCase(Locale.ROOT);
      if (bstb.equals(BreakSuggestionTieBreaker.SUM_FREQ.name())) {
        sortMethod = BreakSuggestionSortMethod.NUM_CHANGES_THEN_SUMMED_FREQUENCY;
      } else if (bstb.equals(BreakSuggestionTieBreaker.MAX_FREQ.name())) {
        sortMethod = BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY;
      } else {
        throw new IllegalArgumentException("Invalid value for parameter "
            + PARAM_BREAK_SUGGESTION_TIE_BREAKER + " : " + bstb);
      }
    } else {
      sortMethod = BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY;
    }
    int mc = intParam(config, PARAM_MAX_CHANGES);
    if (mc > 0) {
      wbsp.setMaxChanges(mc);
    }
    int mcl = intParam(config, PARAM_MAX_COMBINE_WORD_LENGTH);
    if (mcl > 0) {
      wbsp.setMaxCombineWordLength(mcl);
    }
    int mbwl = intParam(config, PARAM_MIN_BREAK_WORD_LENGTH);
    if (mbwl > 0) {
      wbsp.setMinBreakWordLength(mbwl);
    }
    int me = intParam(config, PARAM_MAX_EVALUATIONS);
    if (me > 0) {
      wbsp.setMaxEvaluations(me);
    }
    int msf = intParam(config, PARAM_MIN_SUGGESTION_FREQUENCY);
    if (msf > 0) {
      wbsp.setMinSuggestionFrequency(msf);
    }
    return name;
  }

  private String strParam(@SuppressWarnings("unchecked") NamedList config,
      String paramName) {
    Object o = config.get(paramName);
    return o == null ? null : o.toString();
  }

  private boolean boolParam(@SuppressWarnings("unchecked") NamedList config,
      String paramName) {
    String s = strParam(config, paramName);
    if ("true".equalsIgnoreCase(s) || "on".equalsIgnoreCase(s)) {
      return true;
    }
    return false;
  }

  private int intParam(@SuppressWarnings("unchecked") NamedList config,
      String paramName) {
    Object o = config.get(paramName);
    if (o == null) {
      return 0;
    }
    try {
      return Integer.parseInt(o.toString());
    } catch (NumberFormatException nfe) {
      throw new IllegalArgumentException("Invalid integer for parameter "
          + paramName + " : " + o);
    }
  }

  @Override
  public SpellingResult getSuggestions(SpellingOptions options)
      throws IOException {
    IndexReader ir = options.reader;
    int numSuggestions = options.count;

    StringBuilder sb = new StringBuilder();
    Token[] tokenArr = options.tokens.toArray(new Token[options.tokens.size()]);
    List<Term> termArr = new ArrayList<>(options.tokens.size() + 2);

    List<ResultEntry> breakSuggestionList = new ArrayList<>();
    List<ResultEntry> noBreakSuggestionList = new ArrayList<>();
    boolean lastOneProhibited = false;
    boolean lastOneRequired = false;
    boolean lastOneprocedesNewBooleanOp = false;
    for (int i = 0; i < tokenArr.length; i++) {
      boolean prohibited =
        (tokenArr[i].getFlags() & QueryConverter.PROHIBITED_TERM_FLAG) ==
          QueryConverter.PROHIBITED_TERM_FLAG;
      boolean required =
        (tokenArr[i].getFlags() & QueryConverter.REQUIRED_TERM_FLAG) ==
          QueryConverter.REQUIRED_TERM_FLAG;
      boolean procedesNewBooleanOp =
        (tokenArr[i].getFlags() & QueryConverter.TERM_PRECEDES_NEW_BOOLEAN_OPERATOR_FLAG) ==
          QueryConverter.TERM_PRECEDES_NEW_BOOLEAN_OPERATOR_FLAG;
      if (i > 0
          && (prohibited != lastOneProhibited || required != lastOneRequired || lastOneprocedesNewBooleanOp)) {
        termArr.add(WordBreakSpellChecker.SEPARATOR_TERM);
      }
      lastOneProhibited = prohibited;
      lastOneRequired = required;
      lastOneprocedesNewBooleanOp = procedesNewBooleanOp;

      Term thisTerm = new Term(field, tokenArr[i].toString());
      termArr.add(thisTerm);
      if (breakWords) {
        SuggestWord[][] breakSuggestions = wbsp.suggestWordBreaks(thisTerm,
            numSuggestions, ir, options.suggestMode, sortMethod);
        if(breakSuggestions.length==0) {
          noBreakSuggestionList.add(new ResultEntry(tokenArr[i], null, 0));
        }
        for (SuggestWord[] breakSuggestion : breakSuggestions) {
          sb.delete(0, sb.length());
          boolean firstOne = true;
          int freq = 0;
          for (SuggestWord word : breakSuggestion) {
            if (!firstOne) {
              sb.append(" ");
            }
            firstOne = false;
            sb.append(word.string);
            if (sortMethod == BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY) {
              freq = Math.max(freq, word.freq);
            } else {
              freq += word.freq;
            }
          }
          breakSuggestionList.add(new ResultEntry(tokenArr[i], sb.toString(),
              freq));
        }
      }
    }
    breakSuggestionList.addAll(noBreakSuggestionList);

    List<ResultEntry> combineSuggestionList = Collections.emptyList();
    CombineSuggestion[] combineSuggestions = wbsp.suggestWordCombinations(
        termArr.toArray(new Term[termArr.size()]), numSuggestions, ir, options.suggestMode);
    if (combineWords) {
      combineSuggestionList = new ArrayList<>(
          combineSuggestions.length);
      for (CombineSuggestion cs : combineSuggestions) {
        int firstTermIndex = cs.originalTermIndexes[0];
        int lastTermIndex = cs.originalTermIndexes[cs.originalTermIndexes.length - 1];
        sb.delete(0, sb.length());
        for (int i = firstTermIndex; i <= lastTermIndex; i++) {
          if (i > firstTermIndex) {
            sb.append(" ");
          }
          sb.append(tokenArr[i].toString());
        }
        Token token = new Token(sb.toString(), tokenArr[firstTermIndex]
            .startOffset(), tokenArr[lastTermIndex].endOffset());
        combineSuggestionList.add(new ResultEntry(token, cs.suggestion.string,
            cs.suggestion.freq));
      }
    }

    // Interleave the two lists of suggestions into one SpellingResult
    SpellingResult result = new SpellingResult();
    Iterator<ResultEntry> breakIter = breakSuggestionList.iterator();
    Iterator<ResultEntry> combineIter = combineSuggestionList.iterator();
    ResultEntry lastBreak = breakIter.hasNext() ? breakIter.next() : null;
    ResultEntry lastCombine = combineIter.hasNext() ? combineIter.next() : null;
    int breakCount = 0;
    int combineCount = 0;
    while (lastBreak != null || lastCombine != null) {
      if (lastBreak == null) {
        addToResult(result, lastCombine.token, getCombineFrequency(ir, lastCombine.token), lastCombine.suggestion, lastCombine.freq);
        lastCombine = null;
      } else if (lastCombine == null) {
        addToResult(result, lastBreak.token, ir.docFreq(new Term(field, lastBreak.token.toString())), lastBreak.suggestion, lastBreak.freq);
        lastBreak = null;
      } else if (lastBreak.freq < lastCombine.freq) {
        addToResult(result, lastCombine.token, getCombineFrequency(ir, lastCombine.token), lastCombine.suggestion, lastCombine.freq);
        lastCombine = null;
      } else if (lastCombine.freq < lastBreak.freq) {
        addToResult(result, lastBreak.token, ir.docFreq(new Term(field, lastBreak.token.toString())), lastBreak.suggestion, lastBreak.freq);
        lastBreak = null;
      } else if (breakCount >= combineCount) { //TODO: Should reverse >= to < ??S
        addToResult(result, lastCombine.token, getCombineFrequency(ir, lastCombine.token), lastCombine.suggestion, lastCombine.freq);
        lastCombine = null;
      } else {
        addToResult(result, lastBreak.token, ir.docFreq(new Term(field, lastBreak.token.toString())), lastBreak.suggestion, lastBreak.freq);
        lastBreak = null;
      }
      if (lastBreak == null && breakIter.hasNext()) {
        lastBreak = breakIter.next();
        breakCount++;
      }
      if (lastCombine == null && combineIter.hasNext()) {
        lastCombine = combineIter.next();
        combineCount++;
      }
    }
    return result;
  }
  private void addToResult(SpellingResult result, Token token, int tokenFrequency, String suggestion, int suggestionFrequency) {
    if(suggestion==null) {
      result.add(token, Collections.<String>emptyList());
      result.addFrequency(token, tokenFrequency);
    } else {
      result.add(token, suggestion, suggestionFrequency);
      result.addFrequency(token, tokenFrequency);
    }
  }

  private int getCombineFrequency(IndexReader ir, Token token) throws IOException {
    String[] words = spacePattern.split(token.toString());
    int result = 0;
    if(sortMethod==BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY) {
      for(String word : words) {
        result = Math.max(result, ir.docFreq(new Term(field, word)));
      }
    } else {
      for(String word : words) {
        result += ir.docFreq(new Term(field, word));
      }
    }
    return result;
  }

  @Override
  public void build(SolrCore core, SolrIndexSearcher searcher) {
  /* no-op */
  }

  @Override
  public void reload(SolrCore core, SolrIndexSearcher searcher)
      throws IOException {
  /* no-op */
  }

  @Override
  public boolean isSuggestionsMayOverlap() {
    return true;
  }
}